from pyspark.sql import SparkSession
import pyspark.sql.functions as f
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import folium
import folium.plugins
import seaborn as sns
import json
import re
%matplotlib inline
# Default size applied to every matplotlib figure created below.
plt.rcParams['figure.figsize'] = (10, 10)

# Spark session running in local mode ("local[*]" master).
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark.version

DATA_FOLDER = '/home/jovyan/work/data/housing'

# Cached because the same DataFrame feeds every query and plot that follows.
houses = spark.read.parquet(DATA_FOLDER + '/hemnet').cache()

# Peek at up to five rows whose address contains "ankargatan".
# NOTE(review): the column is referenced as `addres` (single "s") -- confirm
# this matches the parquet schema printed by printSchema() below.
houses.filter(f.lower(houses.addres).like('%ankargatan%')).limit(5).toPandas()
houses.printSchema()

# First four characters of soldDate, which the queries below treat as the
# year. Spark's substring() takes a 1-based position; the previous 0 only
# worked because Spark tolerates it and yields the same leading characters.
sold_year = f.substring(houses.soldDate, 1, 4)

# Row count per year, ordered by year.
houses.groupBy(sold_year.alias('year')).count().orderBy(f.col('year')).show()

# Everything from here on only looks at rows whose year prefix is '2016'.
houses = houses.filter(sold_year == '2016')

# Row count per county, largest first.
houses.groupBy(houses.county).count().orderBy(f.col('count').desc()).show()
def plot(houses):
    """Return a folium map with a heat-map layer of the rows in ``houses``.

    The ``lat`` and ``lon`` fields of every row are collected to the driver
    and handed to ``folium.plugins.HeatMap`` as ``(lat, lon)`` pairs.

    Args:
        houses: Spark DataFrame whose rows expose ``lat`` and ``lon``.

    Returns:
        The ``folium.Map`` with the heat-map layer attached, initially
        centred on (60.128161, 18.643501) at zoom level 5.
    """
    coordinates = houses.rdd.map(lambda row: (row.lat, row.lon)).collect()

    map_options = {
        'location': [60.128161, 18.643501],
        'zoom_start': 5,
        'control_scale': False,
        'tiles': 'stamentoner',
        'attr': 'USGS style',
    }
    heat_map = folium.Map(**map_options)

    heat_layer = folium.plugins.HeatMap(coordinates, radius=10)
    heat_map.add_child(heat_layer)
    return heat_map
# Build the heat map for the current `houses` DataFrame.
plot(houses)
def plot_circles(houses, threshold, lon=18.643501, lat=60.128161, zoom=5):
    """Return a folium map with one colour-coded circle per row of ``houses``.

    Rows with a null ``ppsm`` are skipped. Every remaining row becomes a
    fixed-size circle at ``(lat, lon)`` whose fill colour is looked up in
    matplotlib's ``coolwarm`` colormap at ``ppsm / threshold``.

    Args:
        houses: Spark DataFrame whose rows expose ``lat``, ``lon`` and
            ``ppsm`` (presumably price per square metre -- confirm).
        threshold: Divisor applied to ``ppsm`` before the colormap lookup.
            NOTE(review): ratios above 1 are expected to saturate at the top
            colour of the colormap -- confirm against matplotlib's docs.
        lon: Longitude of the initial map centre.
        lat: Latitude of the initial map centre.
        zoom: Initial zoom level of the map.

    Returns:
        The ``folium.Map`` with all circle markers attached.
    """
    houses = houses.withColumn('ppsm_rel', houses.ppsm / threshold)
    rows = (houses
            .filter(houses.ppsm.isNotNull())
            .rdd.map(lambda row: (row.lat, row.lon, row.ppsm_rel))
            ).collect()

    cmap = plt.get_cmap('coolwarm')
    m = folium.Map(
        location=[lat, lon],
        zoom_start=zoom,
        control_scale=False,
        tiles='stamentoner',
        attr='USGS style',
    )

    # Unpack in the loop header: tuple-unpacking lambda parameters were
    # removed in Python 3. The row_* names avoid shadowing the map-centre
    # arguments `lat` and `lon`.
    for row_lat, row_lon, ppsm_rel in rows:
        color = matplotlib.colors.to_hex(cmap(ppsm_rel))
        m.add_child(folium.CircleMarker(
            (row_lat, row_lon),
            radius=3,
            color=None,
            fill_color=color,
            fill_opacity=0.7,
        ))
    return m
# Country-wide view (default centre and zoom): a sample drawn without
# replacement at fraction 0.2 of the rows outside the three listed counties.
plot_circles(
    houses
    .filter(~houses.county.isin(['Stockholm', 'Göteborg', 'Malmö']))
    .sample(False, 0.2),
    50000,
)

# Malmö only, with the map centred and zoomed in on explicit coordinates.
plot_circles(
    houses.filter(houses.county == 'Malmö'),
    50000,
    lon=13.010559,
    lat=55.569416,
    zoom=11.5,
)